clean_names %>%
filter(sploc != 0) %>%
# The criteria from DQYDJ.
# filter(age >= 16, classwkr >= 20, classwkr <= 28, wkswork1 >= 40, workly == 2) %>%
filter(marst == 1, age %in% 25:65, wkswork1 >= 20, classwkr %in% 20:28) %>%
# The criteria from Feb.
# filter(nchild == 0, marst == 1, empstat == 10) %>%
# Filter out top-coded individuals if they are there.
# filter(incwage < 10e6) %>%
# filter(nchild == 0, marst %in% c(1), classwkr %in% 10:28) %>%
group_by(serial, sploc, pernum) %>%
mutate(pair = paste0(serial, "_", min(pernum, sploc), "-", max(pernum, sploc))) %>%
ungroup() %>%
group_by(pair) %>%
mutate(len = length(pair)) %>%
filter(len > 1) %>%
# Construct empirical data from filtered data.
rename(wt = asecwt) %>%
arrange(incwage) %>%
ungroup() %>%
mutate(pdf = wt / sum(wt),
cdf = cumsum(pdf),
percentile = ceiling(100 * cdf)) %>%
select(wt, pair, incwage, sex, pdf, cdf, percentile)
View(df)

# Build df_temp from raw_2021: filtered rows that still have a partner row,
# plus weighted-distribution columns over incwage.
# Rows kept: sploc != 0, marst == 1, age 25-65, wkswork1 >= 20, classwkr 20-28.
# Alternative criteria tried earlier (left disabled):
#   DQYDJ: age >= 16, classwkr >= 20, classwkr <= 28, wkswork1 >= 40, workly == 2
#   Feb:   nchild == 0, marst == 1, empstat == 10
#   Top-code guard: incwage < 10e6
#   Other: nchild == 0, marst %in% c(1), classwkr %in% 10:28
df_temp <- raw_2021 %>%
  clean_names() %>%
  filter(
    sploc != 0,
    marst == 1,
    age %in% 25:65,
    wkswork1 >= 20,
    classwkr %in% 20:28
  ) %>%
  # Pair key: serial plus the smaller and larger of pernum/sploc, so two rows
  # that point at each other share one key. Grouping by (serial, sploc, pernum)
  # makes min()/max() see a single pernum/sploc combination at a time.
  group_by(serial, sploc, pernum) %>%
  mutate(
    pair = paste0(serial, "_", min(pernum, sploc), "-", max(pernum, sploc))
  ) %>%
  # group_by() replaces the previous grouping; keep keys with more than one row.
  group_by(pair) %>%
  filter(n() > 1) %>%
  ungroup() %>%
  # Weight share (pdf), running share (cdf) and percentile in incwage order.
  arrange(incwage) %>%
  mutate(
    pdf = asecwt / sum(asecwt),
    cdf = cumsum(pdf),
    percentile = ceiling(100 * cdf)
  ) %>%
  select(wt = asecwt, pair, incwage, sex, pdf, cdf, percentile)

View(df_temp)
# Build df from raw_2021: filtered rows that still have a partner row, the
# weighted distribution over incwage, and per-percentile summaries.
# Rows kept: sploc != 0, marst == 1, age 25-65, wkswork1 >= 20, classwkr 20-28.
# Alternative criteria tried earlier (left disabled):
#   DQYDJ: age >= 16, classwkr >= 20, classwkr <= 28, wkswork1 >= 40, workly == 2
#   Feb:   nchild == 0, marst == 1, empstat == 10
#   Top-code guard: incwage < 10e6
#   Other: nchild == 0, marst %in% c(1), classwkr %in% 10:28
df <- raw_2021 %>%
  clean_names() %>%
  filter(sploc != 0) %>%
  filter(marst == 1, age %in% 25:65, wkswork1 >= 20, classwkr %in% 20:28) %>%
  # Pair key: serial plus the smaller and larger of pernum/sploc, so two rows
  # that point at each other share one key.
  group_by(serial, sploc, pernum) %>%
  mutate(
    pair = paste0(serial, "_", min(pernum, sploc), "-", max(pernum, sploc))
  ) %>%
  ungroup() %>%
  # Keep only keys that still have more than one row after filtering.
  group_by(pair) %>%
  mutate(len = length(pair)) %>%
  filter(len > 1) %>%
  # Construct empirical data from filtered data.
  rename(wt = asecwt) %>%
  arrange(incwage) %>%
  ungroup() %>%
  mutate(
    pdf = wt / sum(wt),
    cdf = cumsum(pdf),
    # cumsum() rounding can push the last cdf values slightly above 1, which
    # would create a spurious percentile 101; cap at 100.
    percentile = pmin(ceiling(100 * cdf), 100)
  ) %>%
  select(wt, pair, incwage, sex, pdf, cdf, percentile) %>%
  # Compute statistics within percentiles: lowest incwage in the bucket, each
  # row's share of the bucket weight, and the bucket's weighted mean incwage.
  group_by(percentile) %>%
  mutate(
    earnings_threshold_percentile = min(incwage),
    pdf_percentile = pdf / sum(pdf),
    mean_incwage_percentile = sum(pdf_percentile * incwage)
  ) %>%
  ungroup()
View(df)
View(df_temp)

# Inspect the percentile column of df_temp.
# R has no open-ended slices such as x[-10:] or x[1:]; those forms do not
# parse. Use tail()/head() or negative indices instead.
df_temp$percentile
tail(df_temp$percentile, 10)        # last ten values
df_temp$percentile[-(1:3)]          # everything except the first three
df_temp$percentile[-1]              # everything except the first
df_temp$percentile[-3]              # everything except the third

# Same checks on a plain vector copy of the column.
as.vector(df_temp$percentile)
as.vector(df_temp$percentile)[-(1:3)]
as.vector(df_temp$percentile)[-(1:10)]
as.vector(df_temp$percentile)[1:10]
as.vector(df_temp$percentile)[-10]
as.vector(df_temp$percentile)[10]

# Look at the last row and count values above 100.
nrow(df_temp)
df_temp$percentile[nrow(df_temp)]
sum(df_temp$percentile > 100)

# Overwrite any percentile above 100 with 100.
df_temp$percentile[df_temp$percentile > 100] <- 100
View(df_temp)
# Add per-percentile summaries to df_temp. Within each percentile bucket:
#   earnings_threshold_percentile - smallest incwage in the bucket
#   pdf_percentile                - row's pdf as a share of the bucket's pdf
#   mean_incwage_percentile       - incwage averaged with pdf_percentile weights
df_temp <- df_temp %>%
  group_by(percentile) %>%
  mutate(earnings_threshold_percentile = min(incwage)) %>%
  mutate(pdf_percentile = pdf / sum(pdf)) %>%
  mutate(mean_incwage_percentile = sum(pdf_percentile * incwage)) %>%
  ungroup()

View(df_temp)
# Build df from raw_2021 in one pass: filtered rows that still have a partner
# row, the weighted distribution over incwage (percentile capped at 100), and
# per-percentile summaries.
# Rows kept: sploc != 0, marst == 1, age 25-65, wkswork1 >= 20, classwkr 20-28.
# Alternative criteria tried earlier (left disabled):
#   DQYDJ: age >= 16, classwkr >= 20, classwkr <= 28, wkswork1 >= 40, workly == 2
#   Feb:   nchild == 0, marst == 1, empstat == 10
#   Top-code guard: incwage < 10e6
#   Other: nchild == 0, marst %in% c(1), classwkr %in% 10:28
df <- raw_2021 %>%
  clean_names() %>%
  filter(
    sploc != 0,
    marst == 1,
    age %in% 25:65,
    wkswork1 >= 20,
    classwkr %in% 20:28
  ) %>%
  # Pair key: serial plus the smaller and larger of pernum/sploc, so two rows
  # that point at each other share one key.
  group_by(serial, sploc, pernum) %>%
  mutate(
    pair = paste0(serial, "_", min(pernum, sploc), "-", max(pernum, sploc))
  ) %>%
  # group_by() replaces the previous grouping; keep keys with more than one row.
  group_by(pair) %>%
  filter(n() > 1) %>%
  ungroup() %>%
  arrange(incwage) %>%
  mutate(
    pdf = asecwt / sum(asecwt),
    cdf = cumsum(pdf),
    # Adjust so no percentile is over 100.
    percentile = pmin(ceiling(100 * cdf), 100)
  ) %>%
  select(wt = asecwt, pair, incwage, sex, pdf, cdf, percentile) %>%
  # Per-percentile summaries: lowest incwage, within-bucket weight share, and
  # the bucket's weighted mean incwage.
  group_by(percentile) %>%
  mutate(
    earnings_threshold_percentile = min(incwage),
    pdf_percentile = pdf / sum(pdf),
    mean_incwage_percentile = sum(pdf_percentile * incwage)
  ) %>%
  ungroup()
# Compare each column of df with the same-named column of df_fromfolder and
# print the name and mismatch count of every column that differs.
# Numeric columns use an absolute tolerance of 0.01; other columns use !=.
for (col in colnames(df)) {
  v <- as.vector(df[[col]])
  ref <- df_fromfolder[[col]]
  # A missing reference column would compare against NULL and pass vacuously,
  # because all(logical(0)) is TRUE. Report it instead.
  if (is.null(ref)) {
    print(col)
    print("column missing from df_fromfolder")
    next
  }
  # is.numeric() is TRUE for integer as well as double columns.
  if (is.numeric(v)) {
    mismatch <- abs(v - ref) >= 0.01
  } else {
    mismatch <- v != ref
  }
  # Comparisons involving NA are counted as mismatches so they are surfaced
  # rather than turning the if () condition into NA.
  mismatch[is.na(mismatch)] <- TRUE
  # Count mismatches in both branches; the non-numeric branch used to report
  # all(v != ref), which is a TRUE/FALSE flag rather than a count.
  n <- sum(mismatch)
  val <- n == 0
  if (!val) {
    print(col)
    print(n)
  }
}
# Put the freshly computed and the stored mean_incwage_percentile side by side.
new_means <- df[["mean_incwage_percentile"]]
old_means <- df_fromfolder[["mean_incwage_percentile"]]
tempdf <- data.frame(new = new_means, old = old_means)

# Scatter of new vs. old; the line plots new against itself as a reference.
ggplot(tempdf) +
  geom_point(aes(new, old)) +
  geom_line(aes(new, new, color = "45-degree"))

# Open the rows of df whose two values differ by more than 0.01.
idx <- with(tempdf, abs(new - old) > 0.01)
View(df[idx, ])
# Compare each column of df with the same-named column of df_fromfolder and
# print the name and mismatch count of every column that differs.
# Numeric columns use an absolute tolerance of 0.01; other columns use !=.
for (col in colnames(df)) {
  v <- as.vector(df[[col]])
  ref <- df_fromfolder[[col]]
  # A missing reference column would compare against NULL and pass vacuously,
  # because all(logical(0)) is TRUE. Report it instead.
  if (is.null(ref)) {
    print(col)
    print("column missing from df_fromfolder")
    next
  }
  # is.numeric() is TRUE for integer as well as double columns.
  if (is.numeric(v)) {
    mismatch <- abs(v - ref) >= 0.01
  } else {
    mismatch <- v != ref
  }
  # Comparisons involving NA are counted as mismatches so they are surfaced
  # rather than turning the if () condition into NA.
  mismatch[is.na(mismatch)] <- TRUE
  # Count mismatches in both branches; the non-numeric branch used to report
  # all(v != ref), which is a TRUE/FALSE flag rather than a count.
  n <- sum(mismatch)
  val <- n == 0
  if (!val) {
    print(col)
    print(n)
  }
}
# Read in data from DQYDJ website and CPS original data.
if (!require("ipumsr")) stop("Reading IPUMS data into R requires the ipumsr package. It can be installed using the following command: install.packages('ipumsr')")
library(wCorr)
library(tidyr)
library(dplyr)
library(janitor)
library(readr)
library(ggplot2)
library(DescTools)
# Load the IPUMS extract: parse the DDI file named by data_name, then read the
# microdata it describes.
data_name <- "cps_00012.xml"
raw_2021 <- read_ipums_micro(read_ipums_ddi(data_name))
# # Filter data for couples as per a custom methodology.
# df <- raw_2021 %>%
#   clean_names %>%
#   filter(sploc != 0) %>%
#   # The criteria from DQYDJ.
#   # filter(age >= 16, classwkr >= 20, classwkr <= 28, wkswork1 >= 40, workly == 2) %>%
#   filter(marst == 1, age %in% 25:65, wkswork1 >= 20, classwkr %in% 20:28) %>%
#   # The criteria from Feb.
#   # filter(nchild == 0, marst == 1, empstat == 10) %>%
#   # Filter out top-coded individuals if they are there.
#   # filter(incwage < 10e6) %>%
#   # filter(nchild == 0, marst %in% c(1), classwkr %in% 10:28) %>%
#   group_by(serial, sploc, pernum) %>%
#   mutate(pair = paste0(serial, "_", min(pernum, sploc), "-", max(pernum, sploc))) %>%
#   ungroup() %>%
#   group_by(pair) %>%
#   mutate(len = length(pair)) %>%
#   filter(len > 1) %>%
#   # Construct empirical data from filtered data.
#   rename(wt = asecwt) %>%
#   arrange(incwage) %>%
#   ungroup() %>%
#   mutate(pdf = wt / sum(wt),
#          cdf = cumsum(pdf),
#          percentile = ceiling(100 * cdf)) %>%
#   select(wt, pair, incwage, sex, pdf, cdf, percentile) %>%
#   # Compute statistics within percentiles.
#   group_by(percentile) %>%
#   mutate(earnings_threshold_percentile = min(incwage),
#          pdf_percentile = pdf / sum(pdf),
#          mean_incwage_percentile = sum(pdf_percentile * incwage)) %>%
#   ungroup()
# Build df from raw_2021 in one pass: filtered rows that still have a partner
# row, the weighted distribution over incwage (percentile capped at 100), and
# per-percentile summaries.
# Rows kept: sploc != 0, marst == 1, age 25-65, wkswork1 >= 20, classwkr 20-28.
# Alternative criteria tried earlier (left disabled):
#   DQYDJ: age >= 16, classwkr >= 20, classwkr <= 28, wkswork1 >= 40, workly == 2
#   Feb:   nchild == 0, marst == 1, empstat == 10
#   Top-code guard: incwage < 10e6
#   Other: nchild == 0, marst %in% c(1), classwkr %in% 10:28
df <- raw_2021 %>%
  clean_names() %>%
  filter(
    sploc != 0,
    marst == 1,
    age %in% 25:65,
    wkswork1 >= 20,
    classwkr %in% 20:28
  ) %>%
  # Pair key: serial plus the smaller and larger of pernum/sploc, so two rows
  # that point at each other share one key.
  group_by(serial, sploc, pernum) %>%
  mutate(
    pair = paste0(serial, "_", min(pernum, sploc), "-", max(pernum, sploc))
  ) %>%
  # group_by() replaces the previous grouping; keep keys with more than one row.
  group_by(pair) %>%
  filter(n() > 1) %>%
  ungroup() %>%
  arrange(incwage) %>%
  mutate(
    pdf = asecwt / sum(asecwt),
    cdf = cumsum(pdf),
    # Adjust so no percentile is over 100.
    percentile = pmin(ceiling(100 * cdf), 100)
  ) %>%
  select(wt = asecwt, pair, incwage, sex, pdf, cdf, percentile) %>%
  # Per-percentile summaries: lowest incwage, within-bucket weight share, and
  # the bucket's weighted mean incwage.
  group_by(percentile) %>%
  mutate(
    earnings_threshold_percentile = min(incwage),
    pdf_percentile = pdf / sum(pdf),
    mean_incwage_percentile = sum(pdf_percentile * incwage)
  ) %>%
  ungroup()
# Load the previously written sample and compare it column by column with the
# df built in this session. Prints the name and mismatch count of every column
# that differs. Numeric columns use an absolute tolerance of 0.01; other
# columns use !=.
df_fromfolder <- read_csv("sample.csv")
for (col in colnames(df)) {
  v <- as.vector(df[[col]])
  ref <- df_fromfolder[[col]]
  # A missing reference column would compare against NULL and pass vacuously,
  # because all(logical(0)) is TRUE. Report it instead.
  if (is.null(ref)) {
    print(col)
    print("column missing from df_fromfolder")
    next
  }
  # is.numeric() is TRUE for integer as well as double columns.
  if (is.numeric(v)) {
    mismatch <- abs(v - ref) >= 0.01
  } else {
    mismatch <- v != ref
  }
  # Comparisons involving NA are counted as mismatches so they are surfaced
  # rather than turning the if () condition into NA.
  mismatch[is.na(mismatch)] <- TRUE
  # Count mismatches in both branches; the non-numeric branch used to report
  # all(v != ref), which is a TRUE/FALSE flag rather than a count.
  n <- sum(mismatch)
  val <- n == 0
  if (!val) {
    print(col)
    print(n)
  }
}
View(df)
# Read in data from DQYDJ website and CPS original data.
if (!require("ipumsr")) stop("Reading IPUMS data into R requires the ipumsr package. It can be installed using the following command: install.packages('ipumsr')")
library(wCorr)
library(tidyr)
library(dplyr)
library(janitor)
library(readr)
library(ggplot2)
library(DescTools)
# Load the IPUMS extract: parse the DDI file named by data_name, then read the
# microdata it describes.
data_name <- "cps_00012.xml"
raw_2021 <- read_ipums_micro(read_ipums_ddi(data_name))
# # Filter data for couples as per a custom methodology.
# df <- raw_2021 %>%
#   clean_names %>%
#   filter(sploc != 0) %>%
#   # The criteria from DQYDJ.
#   # filter(age >= 16, classwkr >= 20, classwkr <= 28, wkswork1 >= 40, workly == 2) %>%
#   filter(marst == 1, age %in% 25:65, wkswork1 >= 20, classwkr %in% 20:28) %>%
#   # The criteria from Feb.
#   # filter(nchild == 0, marst == 1, empstat == 10) %>%
#   # Filter out top-coded individuals if they are there.
#   # filter(incwage < 10e6) %>%
#   # filter(nchild == 0, marst %in% c(1), classwkr %in% 10:28) %>%
#   group_by(serial, sploc, pernum) %>%
#   mutate(pair = paste0(serial, "_", min(pernum, sploc), "-", max(pernum, sploc))) %>%
#   ungroup() %>%
#   group_by(pair) %>%
#   mutate(len = length(pair)) %>%
#   filter(len > 1) %>%
#   # Construct empirical data from filtered data.
#   rename(wt = asecwt) %>%
#   arrange(incwage) %>%
#   ungroup() %>%
#   mutate(pdf = wt / sum(wt),
#          cdf = cumsum(pdf),
#          percentile = ceiling(100 * cdf)) %>%
#   select(wt, pair, incwage, sex, pdf, cdf, percentile) %>%
#   # Compute statistics within percentiles.
#   group_by(percentile) %>%
#   mutate(earnings_threshold_percentile = min(incwage),
#          pdf_percentile = pdf / sum(pdf),
#          mean_incwage_percentile = sum(pdf_percentile * incwage)) %>%
#   ungroup()
# Build df from raw_2021 in one pass and write it to sample.csv: filtered rows
# that still have a partner row, the weighted distribution over incwage
# (percentile capped at 100), and per-percentile summaries.
# Rows kept: sploc != 0, marst == 1, age 25-65, wkswork1 >= 20, classwkr 20-28.
# Alternative criteria tried earlier (left disabled):
#   DQYDJ: age >= 16, classwkr >= 20, classwkr <= 28, wkswork1 >= 40, workly == 2
#   Feb:   nchild == 0, marst == 1, empstat == 10
#   Top-code guard: incwage < 10e6
#   Other: nchild == 0, marst %in% c(1), classwkr %in% 10:28
df <- raw_2021 %>%
  clean_names() %>%
  filter(
    sploc != 0,
    marst == 1,
    age %in% 25:65,
    wkswork1 >= 20,
    classwkr %in% 20:28
  ) %>%
  # Pair key: serial plus the smaller and larger of pernum/sploc, so two rows
  # that point at each other share one key.
  group_by(serial, sploc, pernum) %>%
  mutate(
    pair = paste0(serial, "_", min(pernum, sploc), "-", max(pernum, sploc))
  ) %>%
  # group_by() replaces the previous grouping; keep keys with more than one row.
  group_by(pair) %>%
  filter(n() > 1) %>%
  ungroup() %>%
  arrange(incwage) %>%
  mutate(
    pdf = asecwt / sum(asecwt),
    cdf = cumsum(pdf),
    # Adjust so no percentile is over 100.
    percentile = pmin(ceiling(100 * cdf), 100)
  ) %>%
  select(wt = asecwt, pair, incwage, sex, pdf, cdf, percentile) %>%
  # Per-percentile summaries: lowest incwage, within-bucket weight share, and
  # the bucket's weighted mean incwage.
  group_by(percentile) %>%
  mutate(
    earnings_threshold_percentile = min(incwage),
    pdf_percentile = pdf / sum(pdf),
    mean_incwage_percentile = sum(pdf_percentile * incwage)
  ) %>%
  ungroup()

write_csv(df, "sample.csv")
# Read the current and the previous sample back from disk and compare them
# column by column, in four passes of decreasing strictness.
df <- read_csv("sample.csv")
df_old <- read_csv("sample_old.csv")

# Pass 1: exact equality for every column.
for (col in names(df)) {
  print(col)
  print(all(df[[col]] == df_old[[col]]))
}

# Pass 2: exact equality, reported only for columns of class "numeric".
for (col in names(df)) {
  print(col)
  v <- as.vector(df[[col]])
  if (identical(class(v), "numeric")) {
    print(all(df[[col]] == df_old[[col]]))
  }
}

# Passes 3 and 4: absolute-tolerance comparison for "numeric" columns, first
# at 0.01 and then at 0.001.
for (tol in c(0.01, 0.001)) {
  for (col in names(df)) {
    print(col)
    v <- as.vector(df[[col]])
    vold <- as.vector(df_old[[col]])
    if (identical(class(v), "numeric")) {
      print(all(abs(v - vold) < tol))
    }
  }
}
# Read in data from DQYDJ website and CPS original data.
if (!require("ipumsr")) stop("Reading IPUMS data into R requires the ipumsr package. It can be installed using the following command: install.packages('ipumsr')")
library(wCorr)
library(tidyr)
library(dplyr)
library(janitor)
library(readr)
library(ggplot2)
library(DescTools)
# Load the IPUMS extract: parse the DDI file named by data_name, then read the
# microdata it describes.
data_name <- "cps_00012.xml"
raw_2021 <- read_ipums_micro(read_ipums_ddi(data_name))
# # Filter data for couples as per a custom methodology.
# df <- raw_2021 %>%
#   clean_names %>%
#   filter(sploc != 0) %>%
#   # The criteria from DQYDJ.
#   # filter(age >= 16, classwkr >= 20, classwkr <= 28, wkswork1 >= 40, workly == 2) %>%
#   filter(marst == 1, age %in% 25:65, wkswork1 >= 20, classwkr %in% 20:28) %>%
#   # The criteria from Feb.
#   # filter(nchild == 0, marst == 1, empstat == 10) %>%
#   # Filter out top-coded individuals if they are there.
#   # filter(incwage < 10e6) %>%
#   # filter(nchild == 0, marst %in% c(1), classwkr %in% 10:28) %>%
#   group_by(serial, sploc, pernum) %>%
#   mutate(pair = paste0(serial, "_", min(pernum, sploc), "-", max(pernum, sploc))) %>%
#   ungroup() %>%
#   group_by(pair) %>%
#   mutate(len = length(pair)) %>%
#   filter(len > 1) %>%
#   # Construct empirical data from filtered data.
#   rename(wt = asecwt) %>%
#   arrange(incwage) %>%
#   ungroup() %>%
#   mutate(pdf = wt / sum(wt),
#          cdf = cumsum(pdf),
#          percentile = ceiling(100 * cdf)) %>%
#   select(wt, pair, incwage, sex, pdf, cdf, percentile) %>%
#   # Compute statistics within percentiles.
#   group_by(percentile) %>%
#   mutate(earnings_threshold_percentile = min(incwage),
#          pdf_percentile = pdf / sum(pdf),
#          mean_incwage_percentile = sum(pdf_percentile * incwage)) %>%
#   ungroup()
# Build df from raw_2021 in one pass and write it to sample.csv: filtered rows
# that still have a partner row, the weighted distribution over incwage
# (percentile capped at 100), and per-percentile summaries.
# Rows kept: sploc != 0, marst == 1, age 25-65, wkswork1 >= 20, classwkr 20-28.
# Alternative criteria tried earlier (left disabled):
#   DQYDJ: age >= 16, classwkr >= 20, classwkr <= 28, wkswork1 >= 40, workly == 2
#   Feb:   nchild == 0, marst == 1, empstat == 10
#   Top-code guard: incwage < 10e6
#   Other: nchild == 0, marst %in% c(1), classwkr %in% 10:28
df <- raw_2021 %>%
  clean_names() %>%
  filter(
    sploc != 0,
    marst == 1,
    age %in% 25:65,
    wkswork1 >= 20,
    classwkr %in% 20:28
  ) %>%
  # Pair key: serial plus the smaller and larger of pernum/sploc, so two rows
  # that point at each other share one key.
  group_by(serial, sploc, pernum) %>%
  mutate(
    pair = paste0(serial, "_", min(pernum, sploc), "-", max(pernum, sploc))
  ) %>%
  # group_by() replaces the previous grouping; keep keys with more than one row.
  group_by(pair) %>%
  filter(n() > 1) %>%
  ungroup() %>%
  arrange(incwage) %>%
  mutate(
    pdf = asecwt / sum(asecwt),
    cdf = cumsum(pdf),
    # Adjust so no percentile is over 100.
    percentile = pmin(ceiling(100 * cdf), 100)
  ) %>%
  select(wt = asecwt, pair, incwage, sex, pdf, cdf, percentile) %>%
  # Per-percentile summaries: lowest incwage, within-bucket weight share, and
  # the bucket's weighted mean incwage.
  group_by(percentile) %>%
  mutate(
    earnings_threshold_percentile = min(incwage),
    pdf_percentile = pdf / sum(pdf),
    mean_incwage_percentile = sum(pdf_percentile * incwage)
  ) %>%
  ungroup()

write_csv(df, "sample.csv")
# Read in data from DQYDJ website and CPS original data.
if (!require("ipumsr")) stop("Reading IPUMS data into R requires the ipumsr package. It can be installed using the following command: install.packages('ipumsr')")
library(wCorr)
library(tidyr)
library(dplyr)
library(janitor)
library(readr)
library(ggplot2)
library(DescTools)
# Read in raw data: parse the DDI file, then read the microdata it describes.
data_name <- "cps_00012.xml"
raw_2021 <- read_ipums_ddi(data_name) %>% read_ipums_micro()

# Filter data for couples as per a custom methodology. All columns of the
# cleaned extract are kept here (there is no select()), plus pair, len, pdf,
# cdf and percentile.
# Rows kept: sploc != 0, marst == 1, age 25-65, wkswork1 >= 20, classwkr 20-28.
# Alternative criteria tried earlier (left disabled):
#   DQYDJ: age >= 16, classwkr >= 20, classwkr <= 28, wkswork1 >= 40, workly == 2
#   Feb:   nchild == 0, marst == 1, empstat == 10
#   Top-code guard: incwage < 10e6
#   Other: nchild == 0, marst %in% c(1), classwkr %in% 10:28
df <- raw_2021 %>%
  clean_names() %>%
  filter(sploc != 0) %>%
  filter(marst == 1, age %in% 25:65, wkswork1 >= 20, classwkr %in% 20:28) %>%
  # Pair key: serial plus the smaller and larger of pernum/sploc, so two rows
  # that point at each other share one key.
  group_by(serial, sploc, pernum) %>%
  mutate(
    pair = paste0(serial, "_", min(pernum, sploc), "-", max(pernum, sploc))
  ) %>%
  ungroup() %>%
  # Keep only keys that still have more than one row after filtering.
  group_by(pair) %>%
  mutate(len = length(pair)) %>%
  filter(len > 1) %>%
  # Construct empirical data from filtered data.
  rename(wt = asecwt) %>%
  arrange(incwage) %>%
  ungroup() %>%
  mutate(
    pdf = wt / sum(wt),
    cdf = cumsum(pdf),
    # cumsum() rounding can push the last cdf values slightly above 1; cap the
    # percentile at 100, as the other versions of this pipeline do.
    percentile = pmin(ceiling(100 * cdf), 100)
  )

# Quick inspection of the result and of the raw extract.
df
df[[1]]
View(raw_2021)
colnames(raw_2021)
